#ifndef __DATASET_H_
#define __DATASET_H_

#include "util.h"
#include "suffixindex.h"
#include "lexicon.h"
#include "corpus.h"

// Prepares any derived files we need. Starting from the input file named
// by data_filename (e.g. "data"), builds the companion files "data.X":
//   data.lexicon  - the lexicon we use
//   data.encoded  - the data encoded using the lexicon
//   data.suffixes - sorted suffixes of the encoded data (most of them,
//                   not necessarily all)
//
// data_filename: name of the input data file; the derived file names are
//                formed by appending the extensions listed above.
// NOTE(review): the unqualified `string` type relies on a declaration
// pulled in by one of the headers included by this file - confirm.

void PrepareDataSet(string data_filename);

// Groups the pieces that make up one data set: a corpus, a lexicon, a
// suffix index, and a memory block of token counts.
// NOTE(review): presumably these mirror the data.* files produced by
// PrepareDataSet - confirm against the implementation of Load().
struct DataSet {
  // Takes no arguments and returns nothing; presumably fills in the
  // members below. Where the data comes from is decided by the
  // implementation, which is not visible in this header - TODO confirm.
  void Load();
  // No destructor for now

  // The corpus (type presumably declared in corpus.h).
  Corpus corpus_;
  // The lexicon (type presumably declared in lexicon.h).
  Lexicon lexicon_;
  // The suffix index (type presumably declared in suffixindex.h).
  SuffixIndex index_;
  // Token counts held in a Memblock; the element layout is not visible
  // from this header - verify against the code that fills it.
  Memblock token_counts_;
};

#endif
